%%capture
# Importing packages and functions we need
import sys
# Including cormerant
import cormerant as cmt
# Loading pandas to access the metadata
import pandas as pd
# Using importlib to update the package as we make changes
import importlib
import os
sys.path.append('../../')
os.chdir('/Users/DouglasHannumJr/Downloads/mouse_hypothalamus_200um//')
os.listdir()
['barcodes.csv', 'barcodes_per_feature.csv', 'ExportBarcodes', 'ExportCellBoundaries', 'ExportCellMetadata3D', 'ExportPartitionedBarcodes', 'feature_boundaries.pkl', 'feature_metadata.csv', 'md_updated.csv', 'retained_feature_boundaries.parquet', 'seurat.rds', 'temp_cellBoundaries_with_md_sub.parquet']
import json
from copy import deepcopy
import numpy as np
import pandas as pd
import scanpy as sc
from clustergrammer2 import CGM2, Network
from IPython.display import HTML, display
from matplotlib import pyplot as plt
from observable_jupyter import embed
from observable_jupyter_widget import ObservableWidget
# For plot_3D
import rasterio
import pyarrow.parquet as pq
import geopandas as gpd
from shapely import wkt, wkb
import shapely
from rasterio.windows import Window
from rasterio.enums import Resampling
# Stops the common warning when reading in images used in
# the plot_3d_* functions.
import warnings
warnings.filterwarnings("ignore", category=rasterio.errors.NotGeoreferencedWarning)
from PIL import Image
import base64
from io import BytesIO
import s3fs
import polars as pl
>> clustergrammer2 backend version 0.18.0
C:\Users\DouglasHannumJr\AppData\Roaming\Python\Python310\site-packages\geopandas\_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.4-CAPI-1.16.2). Conversions between both will be slow. warnings.warn( C:\Users\DouglasHannumJr\AppData\Local\Temp\ipykernel_4364\1441961771.py:16: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas: import os os.environ['USE_PYGEOS'] = '0' import geopandas In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html). import geopandas as gpd
import pickle
import pandas
x = pd.read_pickle('feature_boundaries.pkl')
x.to_parquet('./feature_boundaries.parquet')
md = pd.read_csv('./md_updated.csv')
retained = x[x['id'].isin(md['name'])]
md.shape
x.shape
retained.shape
retained.to_parquet('./retained_feature_boundaries.parquet')
md = pd.read_csv('./md_updated.csv', index_col=0)
md[['global.x','global.y']].describe()
md[(md['global.x'] < 10) & (md['global.x'] > -10) & (md['global.y'] > 240) & (md['global.y'] < 260)].iloc[1,:]
directory = './'
cell_idx = '1634e56c0fd840998443f4d10b213647'
verbose = True
metadata_filename = 'md_updated.csv'
cell_categories = 'cell.type'
window_size = 200
max_number_of_cells = 100000
# z_index_number = 3
local = True
# pl_batch_size = 500000
# image_type = 'DAPI'
# image_file = 'images/mosaic_DAPI_z3.tif'
# transformation_file = 'images/micron_to_mosaic_pixel_transform.csv'
cell_boundaries_file = 'retained_feature_boundaries.parquet'
cell_id_name = 'name'
transcripts_file = 'barcodes.csv'
remove_blanks = True
z_distance = 1
cell_colors = None
init_gene = 'none'
observable_url = '@cellvisulization/2-3-3-deck-gl-image-polygon-scatter-3d'
md = pd.read_csv('./md_updated.csv', index_col=0)
cell = md.loc[cell_idx]
window = [cell['global.x'] - window_size/2,
cell['global.x'] + window_size/2,
cell['global.y'] - window_size/2,
cell['global.y'] + window_size/2]
md[['center_x','center_y']] = md[['global.x','global.y']]
md_sub = deepcopy(md[(md.center_x > window[0]) &
(md.center_x < window[1]) &
(md.center_y > window[2]) &
(md.center_y < window[3])])
parquet_file = pq.ParquetFile(cell_boundaries_file)
parquet_file
<pyarrow.parquet.ParquetFile at 0x28d2c822470>
parquet_file.num_row_groups
1
rg = parquet_file.read_row_group(0)
index_ = [x in md_sub.index for x in pd.array(rg['id'])]
rg = rg.filter(index_)
df = rg.to_pandas()
cellBoundaries = df
cellBoundaries['geometry'] = cellBoundaries['geometry'].apply(wkb.loads)
cellBoundaries = gpd.GeoDataFrame(cellBoundaries, geometry='geometry')
cellBoundaries.shape
(21535, 14)
currentCells = cellBoundaries['geometry']
global_bounds = currentCells.total_bounds
cellBoundaries = cellBoundaries.merge(md_sub, left_on = 'id', right_on = 'name')
cellBoundaries.to_parquet('./temp_cellBoundaries_with_md_sub.parquet')
%%time
trx = (pl
.scan_csv(transcripts_file,
rechunk=False)
.filter((pl.col('global_x') > global_bounds[0]) &
(pl.col('global_x') < global_bounds[2]) &
(pl.col('global_y') > global_bounds[1]) &
(pl.col('global_y') < global_bounds[3]))
.select(['global_x','global_y','global_z','barcode_id'])
).collect().to_pandas()
CPU times: total: 44.2 s Wall time: 12.1 s
min(trx.barcode_id)
0
trx_id = pd.read_csv('./barcodes_per_feature.csv',chunksize=10)
trx_id = next(trx_id)
key = pd.DataFrame({'gene': trx_id.columns[1:],
'barcode_id': range(0,216)})
trx = trx.merge(key, on = 'barcode_id')
currentCells = currentCells.reset_index(drop = True)
polygon_data3d = []
cellBoundaries['cluster_grouping'] = cellBoundaries['cell.type']
cellBoundaries.head()
| id | fov_x | area | x | y | z | center_z | geometry | center_x_x | center_y_x | ... | doublet.score | fov_y | volume | nFeature_RNA | nCount_RNA | cell.type | cluster | center_x_y | center_y_y | cluster_grouping | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 02dd707d48394665bd357f0d7db03183 | 22 | 19.678364 | 1810.0 | 618.0 | 9 | 10.0 | POLYGON ((-53.660 210.850, -53.660 210.704, -5... | -55.868503 | 209.387494 | ... | 0.09188 | 22 | 1692.271273 | 56 | 581 | INC | I2 | -55.868503 | 209.387494 | INC |
| 1 | 02dd707d48394665bd357f0d7db03183 | 22 | 53.622804 | 1810.0 | 618.0 | 10 | 11.0 | POLYGON ((-51.759 211.435, -51.759 211.289, -5... | -55.868503 | 209.387494 | ... | 0.09188 | 22 | 1692.271273 | 56 | 581 | INC | I2 | -55.868503 | 209.387494 | INC |
| 2 | 02dd707d48394665bd357f0d7db03183 | 22 | 73.275073 | 1810.0 | 618.0 | 11 | 12.0 | POLYGON ((-51.174 212.312, -51.174 212.166, -5... | -55.868503 | 209.387494 | ... | 0.09188 | 22 | 1692.271273 | 56 | 581 | INC | I2 | -55.868503 | 209.387494 | INC |
| 3 | 02dd707d48394665bd357f0d7db03183 | 22 | 92.713451 | 1810.0 | 618.0 | 12 | 13.0 | POLYGON ((-50.735 212.312, -50.735 212.166, -5... | -55.868503 | 209.387494 | ... | 0.09188 | 22 | 1692.271273 | 56 | 581 | INC | I2 | -55.868503 | 209.387494 | INC |
| 4 | 02dd707d48394665bd357f0d7db03183 | 22 | 104.314878 | 1810.0 | 618.0 | 13 | 14.0 | POLYGON ((-49.711 211.435, -49.711 211.289, -4... | -55.868503 | 209.387494 | ... | 0.09188 | 22 | 1692.271273 | 56 | 581 | INC | I2 | -55.868503 | 209.387494 | INC |
5 rows × 31 columns
for idx in range(0, len(currentCells)):
coordinates =[]
cell = currentCells[idx]
# for polygon in cell.geoms:
coordinates.extend(list(cell.exterior.coords))
z_level = cellBoundaries.iloc[idx].z
coordinates = [list(ele) for ele in coordinates]
coordinates = [[round(x,2) for x in pair] for pair in coordinates]
coordinates3d = [list(list(t) + [float(z_level * z_distance)]) for t in coordinates]
inst_poly3d = {'coordinates': coordinates3d,
'name': str(cellBoundaries.iloc[idx].id),
'z': float(z_level * z_distance),
'cell_type': str(cellBoundaries.iloc[idx].cluster_grouping),
'z_idx': float(z_level)
}
polygon_data3d.append(inst_poly3d)
trx['cell_id'] = -1
trx
| global_x | global_y | global_z | barcode_id | gene | cell_id | |
|---|---|---|---|---|---|---|
| 0 | -53.321777 | 147.97624 | 1.0 | 0 | Slc17a6 | -1 |
| 1 | -44.658420 | 156.94585 | 1.0 | 0 | Slc17a6 | -1 |
| 2 | -43.245136 | 158.45552 | 1.0 | 0 | Slc17a6 | -1 |
| 3 | -41.824020 | 166.70111 | 1.0 | 0 | Slc17a6 | -1 |
| 4 | -102.173310 | 171.07602 | 1.0 | 0 | Slc17a6 | -1 |
| ... | ... | ... | ... | ... | ... | ... |
| 1401810 | 92.265366 | 218.00143 | 176.0 | 158 | Blank-3 | -1 |
| 1401811 | 66.759600 | 164.93889 | 177.0 | 158 | Blank-3 | -1 |
| 1401812 | 114.019980 | 289.76685 | 188.0 | 158 | Blank-3 | -1 |
| 1401813 | 20.250843 | 179.71475 | 196.0 | 158 | Blank-3 | -1 |
| 1401814 | 32.352300 | 181.94100 | 199.0 | 158 | Blank-3 | -1 |
1401815 rows × 6 columns
trx_no_blanks = deepcopy(trx[~trx['gene'].str.contains('Blank')])
trx.shape
(1401815, 6)
trx_no_blanks.shape
(1396002, 6)
trx_no_blanks.head()
| global_x | global_y | global_z | barcode_id | gene | cell_id | |
|---|---|---|---|---|---|---|
| 0 | -53.321777 | 147.97624 | 1.0 | 0 | Slc17a6 | -1 |
| 1 | -44.658420 | 156.94585 | 1.0 | 0 | Slc17a6 | -1 |
| 2 | -43.245136 | 158.45552 | 1.0 | 0 | Slc17a6 | -1 |
| 3 | -41.824020 | 166.70111 | 1.0 | 0 | Slc17a6 | -1 |
| 4 | -102.173310 | 171.07602 | 1.0 | 0 | Slc17a6 | -1 |
df_obs = deepcopy(trx_no_blanks)
df_obs.columns = ['x','y','z','barcode_id','name','partition']
df_obs['x'] = df_obs['x'].round(1)
df_obs['y'] = df_obs['y'].round(1)
df_obs['partition'] = df_obs['partition'].astype(float)
scatter_data = df_obs.to_dict('records')
cats = df_obs.name.unique().tolist()
colors = []
for inst_index in range(len(cats)):
# if 'Blank' in cats[inst_index]:
# colors.append('#d3d3d3')
# else:
mod_index = inst_index % 60
if mod_index < 20:
color_rgba = plt.cm.tab20(mod_index % 20)
elif mod_index < 40:
color_rgba = plt.cm.tab20b(mod_index % 20)
elif mod_index < 60:
color_rgba = plt.cm.tab20c(mod_index % 20)
inst_color = '#{:02x}{:02x}{:02x}'.format(int(color_rgba[0]*255),
int(color_rgba[1]*255),
int(color_rgba[2]*255))
colors.append(inst_color)
cat_colors = dict(zip(cats, colors))
# cell colors
###################
cats = cellBoundaries.cluster_grouping.unique().tolist()
colors = []
for inst_index in range(len(cats)):
mod_index = inst_index % 60
if mod_index < 20:
color_rgba = plt.cm.tab20(mod_index % 20)
elif mod_index < 40:
color_rgba = plt.cm.tab20(mod_index % 20)
elif mod_index < 60:
color_rgba = plt.cm.tab20(mod_index % 20)
inst_color = '#{:02x}{:02x}{:02x}'.format( int(color_rgba[0]* 255), int(color_rgba[1]* 255), int(color_rgba[2]* 255))
colors.append(inst_color)
cell_colors = dict(zip(cats, colors))
x_offset = float(global_bounds[0])
y_offset = float(global_bounds[1])
zip_polygon_data3d = cmt.viz.json_zip(polygon_data3d)
zip_scatter_data = cmt.viz.json_zip(scatter_data)
bounds = global_bounds
fov_inputs = {
# 'polygon_data3d': polygon_data3d,
'zip_polygon_data3d': zip_polygon_data3d,
'zip_scatter_data': zip_scatter_data,
# 'image': image_url,
# 'image_bounds': image_bounds,
'center_x': np.mean([bounds[0], bounds[2]]),
'center_y': np.mean([bounds[1], bounds[3]]),
'zoom': -.5,
'min_zoom': -3,
'height': 800,
'cat_colors': cat_colors,
'cell_colors': cell_colors,
'ini_opacity':1,
'scale_z': 1
# 'init_gene': init_gene,
}
bounds
array([-105.60812654, 145.46162128, 115.75187066, 361.5968692 ])
embed('@cellvisulization/2-3-3plus-deck-gl-image-polygon-scatter-3d',
cells=['dashboard'],
inputs = fov_inputs)